Визуализация данных на Python

In [5]:
import numpy as np
import pandas as pd
In [ ]:
 
In [137]:
# Read the tab-separated populations table (year, hare, lynx, carrot)
# and preview its first rows.
populations_file = 'populations.txt'
data = pd.read_csv(populations_file, sep='\t')
data.head()
Out[137]:
year hare lynx carrot
0 1900 30000.0 4000.0 48300
1 1901 47200.0 6100.0 48200
2 1902 70200.0 9800.0 41500
3 1903 77400.0 35200.0 38200
4 1904 36300.0 59400.0 40600

Библиотеки визуализации данных

In [7]:
import matplotlib
import matplotlib.pyplot as plt 
import matplotlib.mlab as mlab
%matplotlib inline
In [8]:
fig, ax1 = plt.subplots(1, 1)
In [9]:
fig, ax = plt.subplots(2, 2, figsize=(12,10))
In [10]:
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter

def offset_off(x):
    """Print the x-axis major tick values of axes `x` as whole numbers.

    Installs a ``FormatStrFormatter('%.0f')`` as the major formatter of
    ``x.xaxis``, replacing whatever formatter the axes used before.
    """
    whole_number_formatter = FormatStrFormatter('%.0f')
    x.xaxis.set_major_formatter(whole_number_formatter)
In [11]:
fig, ax = plt.subplots(1,1)
ax.plot(data['year'], data['hare'])
offset_off(ax)
In [12]:
# Baseline 2x2 figure: one panel per species, plus a combined panel.
fig, ax = plt.subplots(2, 2, figsize = (12,8))

# (panel, column, colour) for the three single-species panels.
single_panels = [
    (ax[0][0], 'hare', '#8c92ac'),
    (ax[0][1], 'lynx', '#ffa500'),
    (ax[1][0], 'carrot', '#b06500'),
]
for panel, column, colour in single_panels:
    panel.plot(data['year'], data[column], color=colour)
    offset_off(panel)

# The last panel overlays all three series, each with its own line style.
combined = ax[1][1]
overlay = [
    ('hare', 'Hares', '#8c92ac', ':'),
    ('lynx', 'Lynxes', '#ffa500', '--'),
    ('carrot', 'Carrots', '#b06500', '-'),
]
for column, label, colour, line_style in overlay:
    combined.plot(data['year'], data[column], label=label, color=colour, ls=line_style)
offset_off(combined)

Упражнение

Реализовать отображение графика с четырьмя подокнами. На первых трех необходимо отобразить по отдельности популяции зайцев, рысей, моркови, и на четвертом отобразить их всех вместе.

Проработать внешний вид графиков: data-ink ratio, согласованность, ясность (что есть что), целостность и т. д. Например, убедиться, что про каждый график понятно, к чему он относится — к зайцам, рысям или моркови. А также убедиться, что для одних и тех же объектов используются одни и те же цвета.

Решение

In [13]:
# Exercise solution: a 2x2 figure with one panel per species and a combined
# panel. Each species keeps the same colour and line style in every panel.
species_styles = {
    'hare':   dict(label='Hares',   color='#8c92ac', ls=':'),
    'carrot': dict(label='Carrots', color='#ffa500', ls='--'),
    'lynx':   dict(label='Lynxes',  color='#b06500', ls='-'),
}


def style_panel(panel, ylabel):
    """Apply the shared low-ink styling to one axes.

    Sets the axis labels, hides the top and right spines, keeps ticks on the
    left and bottom only, thins the spines and prints years as whole numbers.
    """
    panel.set_xlabel('Time', fontsize=10)
    panel.set_ylabel(ylabel, fontsize=10)

    panel.spines['right'].set_visible(False)
    panel.spines['top'].set_visible(False)

    panel.yaxis.set_ticks_position('left')
    panel.xaxis.set_ticks_position('bottom')

    for side in ['top', 'bottom', 'left', 'right']:
        panel.spines[side].set_linewidth(0.5)

    offset_off(panel)


fig, ax1 = plt.subplots(2, 2, figsize=(12,8))

# Single-species panels; the y label says which species the panel shows.
single_panels = [(ax1[0][0], 'hare'), (ax1[0][1], 'lynx'), (ax1[1][0], 'carrot')]
for panel, column in single_panels:
    style = species_styles[column]
    panel.plot(data['year'], data[column], color=style['color'], ls=style['ls'])
    style_panel(panel, style['label'])

# All of them together, identified through the legend.
combined = ax1[1][1]
for column in ['hare', 'carrot', 'lynx']:
    combined.plot(data['year'], data[column], **species_styles[column])
combined.legend(loc=1, fontsize=10, frameon=False)  # loc=1 is the upper right corner
style_panel(combined, 'Trends in the Forest')

fig.tight_layout()

График можно сохранить в виде файла:

In [14]:
fig.savefig("my_new_plot.png") 

Доступные форматы, какие из них гарантируют сохранение лучшего качества?

Matplotlib может сгенерировать результат высокого качества в разных форматах, в т. ч. PNG, JPG, EPS, SVG, PDF. Для научных статей рекомендуем использовать PDF везде, где это возможно. (В документы LaTeX, собираемые с помощью pdflatex, PDF-изображения можно включать с помощью команды \includegraphics.)

EPS, PDF, SVG — векторные форматы, что означает возможность редактирования изображения в программах, подобных Adobe Illustrator, с сохранением возможности редактирования отдельных элементов изображения: линий, точек, текста и пр.

PNG, JPG - растровые форматы, как фото. В программах редактирования изображений, например, Adobe Illustrator, обычно доступен только один объект для редактирования.

pandas

https://pandas.pydata.org/pandas-docs/stable/visualization.html

  • ‘bar’ or ‘barh’ for bar plots
  • ‘hist’ for histogram
  • ‘box’ for boxplot
  • ‘kde’ or 'density' for density plots
  • ‘area’ for area plots
  • ‘scatter’ for scatter plots
  • ‘hexbin’ for hexagonal bin plots
  • ‘pie’ for pie plots

Основные возможности Pandas

https://pandas.pydata.org/pandas-docs/stable/10min.html

In [15]:
dates = pd.date_range('20130101', periods=6)
dates
Out[15]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
In [16]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[16]:
A B C D
2013-01-01 -2.574077 -1.653252 -0.145710 0.879204
2013-01-02 -1.232259 -0.445670 0.265512 0.824921
2013-01-03 0.006872 0.284348 -0.046508 1.228680
2013-01-04 -1.625315 0.237181 0.686468 -0.011519
2013-01-05 0.749854 1.254197 1.049208 -2.998859
2013-01-06 -1.938103 1.105713 -1.271008 0.752137

view

In [17]:
df.head()
Out[17]:
A B C D
2013-01-01 -2.574077 -1.653252 -0.145710 0.879204
2013-01-02 -1.232259 -0.445670 0.265512 0.824921
2013-01-03 0.006872 0.284348 -0.046508 1.228680
2013-01-04 -1.625315 0.237181 0.686468 -0.011519
2013-01-05 0.749854 1.254197 1.049208 -2.998859
In [18]:
df.tail()
Out[18]:
A B C D
2013-01-02 -1.232259 -0.445670 0.265512 0.824921
2013-01-03 0.006872 0.284348 -0.046508 1.228680
2013-01-04 -1.625315 0.237181 0.686468 -0.011519
2013-01-05 0.749854 1.254197 1.049208 -2.998859
2013-01-06 -1.938103 1.105713 -1.271008 0.752137
In [19]:
df.index
Out[19]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
In [20]:
df.columns
Out[20]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [21]:
df.describe()
Out[21]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -1.102171 0.130420 0.089661 0.112427
std 1.250269 1.073548 0.804138 1.577739
min -2.574077 -1.653252 -1.271008 -2.998859
25% -1.859906 -0.274957 -0.120909 0.179395
50% -1.428787 0.260765 0.109502 0.788529
75% -0.302911 0.900372 0.581229 0.865633
max 0.749854 1.254197 1.049208 1.228680

select

In [22]:
df['A']
Out[22]:
2013-01-01   -2.574077
2013-01-02   -1.232259
2013-01-03    0.006872
2013-01-04   -1.625315
2013-01-05    0.749854
2013-01-06   -1.938103
Freq: D, Name: A, dtype: float64
In [23]:
df[0:3]
Out[23]:
A B C D
2013-01-01 -2.574077 -1.653252 -0.145710 0.879204
2013-01-02 -1.232259 -0.445670 0.265512 0.824921
2013-01-03 0.006872 0.284348 -0.046508 1.228680
In [24]:
df[df.A > 0]
Out[24]:
A B C D
2013-01-03 0.006872 0.284348 -0.046508 1.228680
2013-01-05 0.749854 1.254197 1.049208 -2.998859
In [25]:
df[(df.A > 0) & (df.B < 0)]
Out[25]:
A B C D
In [26]:
df[df > 0]
Out[26]:
A B C D
2013-01-01 NaN NaN NaN 0.879204
2013-01-02 NaN NaN 0.265512 0.824921
2013-01-03 0.006872 0.284348 NaN 1.228680
2013-01-04 NaN 0.237181 0.686468 NaN
2013-01-05 0.749854 1.254197 1.049208 NaN
2013-01-06 NaN 1.105713 NaN 0.752137
In [27]:
s = pd.Series(np.nan, index=[49,48,47,46,45, 1, 2, 3, 4, 5])
s
Out[27]:
49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64
In [28]:
s.iloc[:3]
Out[28]:
49   NaN
48   NaN
47   NaN
dtype: float64
In [29]:
s.loc[:3]
Out[29]:
49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

set

In [30]:
df.at[dates[0],'A'] = 0
df
Out[30]:
A B C D
2013-01-01 0.000000 -1.653252 -0.145710 0.879204
2013-01-02 -1.232259 -0.445670 0.265512 0.824921
2013-01-03 0.006872 0.284348 -0.046508 1.228680
2013-01-04 -1.625315 0.237181 0.686468 -0.011519
2013-01-05 0.749854 1.254197 1.049208 -2.998859
2013-01-06 -1.938103 1.105713 -1.271008 0.752137
In [31]:
df[df.A < 0] = 1
df
Out[31]:
A B C D
2013-01-01 0.000000 -1.653252 -0.145710 0.879204
2013-01-02 1.000000 1.000000 1.000000 1.000000
2013-01-03 0.006872 0.284348 -0.046508 1.228680
2013-01-04 1.000000 1.000000 1.000000 1.000000
2013-01-05 0.749854 1.254197 1.049208 -2.998859
2013-01-06 1.000000 1.000000 1.000000 1.000000
In [32]:
df['E'] = 1
df
Out[32]:
A B C D E
2013-01-01 0.000000 -1.653252 -0.145710 0.879204 1
2013-01-02 1.000000 1.000000 1.000000 1.000000 1
2013-01-03 0.006872 0.284348 -0.046508 1.228680 1
2013-01-04 1.000000 1.000000 1.000000 1.000000 1
2013-01-05 0.749854 1.254197 1.049208 -2.998859 1
2013-01-06 1.000000 1.000000 1.000000 1.000000 1

calculate

In [33]:
df.mean()
Out[33]:
A    0.626121
B    0.480882
C    0.642832
D    0.351504
E    1.000000
dtype: float64
In [34]:
df.max()
Out[34]:
A    1.000000
B    1.254197
C    1.049208
D    1.228680
E    1.000000
dtype: float64
In [35]:
df['A'].apply(lambda x: x + 1)
Out[35]:
2013-01-01    1.000000
2013-01-02    2.000000
2013-01-03    1.006872
2013-01-04    2.000000
2013-01-05    1.749854
2013-01-06    2.000000
Freq: D, Name: A, dtype: float64

join

In [36]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
In [37]:
left
Out[37]:
key lval
0 foo 1
1 foo 2
In [38]:
right
Out[38]:
key rval
0 foo 4
1 foo 5
In [39]:
pd.merge(left, right, on='key')
Out[39]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5

group

In [40]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
Out[40]:
A B C D
0 foo one 0.507651 0.735104
1 bar one -0.037151 1.156442
2 foo two -0.368605 -0.137371
3 bar three 0.341077 1.588245
4 foo two 0.803174 -0.194649
5 bar two -1.225766 0.421135
6 foo one -0.916734 0.832990
7 foo three -0.806171 1.549621
In [41]:
def func(df, ind, col):
    """Return element 0 of the value stored at row label `ind` in column `col`.

    For the string columns used in this notebook that is the first character
    of the cell, which makes the function usable as a groupby key function.
    """
    cell = df[col].loc[ind]
    return cell[0]

# Group the rows by the first letter of column 'A' (the key function receives
# each index label) and sum the numeric columns. C and D are selected
# explicitly so the string columns A and B are never concatenated by sum().
df.groupby(lambda x: func(df, x, 'A'))[['C', 'D']].sum()
Out[41]:
C D
b -0.921840 3.165821
f -0.780684 2.785695
In [42]:
df.groupby(['A','B']).sum()
Out[42]:
C D
A B
bar one -0.037151 1.156442
three 0.341077 1.588245
two -1.225766 0.421135
foo one -0.409082 1.568094
three -0.806171 1.549621
two 0.434569 -0.332020
In [43]:
dir(df.groupby(['A','B']))
Out[43]:
['A',
 'B',
 'C',
 'D',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_agg_doc',
 '_aggregate',
 '_aggregate_generic',
 '_aggregate_item_by_item',
 '_aggregate_multiple_funcs',
 '_apply_filter',
 '_apply_to_column_groupbys',
 '_apply_whitelist',
 '_assure_grouper',
 '_block_agg_axis',
 '_builtin_table',
 '_choose_path',
 '_concat_objects',
 '_constructor',
 '_cumcount_array',
 '_cython_agg_blocks',
 '_cython_agg_general',
 '_cython_table',
 '_cython_transform',
 '_decide_output_index',
 '_def_str',
 '_define_paths',
 '_dir_additions',
 '_dir_deletions',
 '_get_data_to_aggregate',
 '_get_index',
 '_get_indices',
 '_gotitem',
 '_group_selection',
 '_index_with_as_index',
 '_insert_inaxis_grouper_inplace',
 '_internal_names',
 '_internal_names_set',
 '_is_builtin_func',
 '_is_cython_func',
 '_iterate_column_groupbys',
 '_iterate_slices',
 '_make_wrapper',
 '_obj_with_exclusions',
 '_post_process_cython_aggregate',
 '_python_agg_general',
 '_python_apply_general',
 '_reindex_output',
 '_reset_cache',
 '_reset_group_selection',
 '_see_also_template',
 '_selected_obj',
 '_selection',
 '_selection_list',
 '_set_group_selection',
 '_set_result_index_ordered',
 '_shallow_copy',
 '_transform_fast',
 '_transform_general',
 '_transform_item_by_item',
 '_try_cast',
 '_wrap_agged_blocks',
 '_wrap_aggregated_output',
 '_wrap_applied_output',
 '_wrap_generic_output',
 '_wrap_transformed_output',
 'agg',
 'aggregate',
 'all',
 'any',
 'apply',
 'backfill',
 'bfill',
 'boxplot',
 'corr',
 'corrwith',
 'count',
 'cov',
 'cumcount',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'describe',
 'diff',
 'dtypes',
 'expanding',
 'ffill',
 'fillna',
 'filter',
 'first',
 'get_group',
 'groups',
 'head',
 'hist',
 'idxmax',
 'idxmin',
 'indices',
 'irow',
 'last',
 'mad',
 'max',
 'mean',
 'median',
 'min',
 'name',
 'ndim',
 'ngroups',
 'nth',
 'ohlc',
 'pad',
 'pct_change',
 'plot',
 'prod',
 'quantile',
 'rank',
 'resample',
 'rolling',
 'sem',
 'shift',
 'size',
 'skew',
 'std',
 'sum',
 'tail',
 'take',
 'transform',
 'tshift',
 'var']

pivot table

In [44]:
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['a', 'b', 'c'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
df
Out[44]:
A B C D E
0 one a foo 0.400035 -1.770021
1 one b foo -0.658406 -0.513939
2 two c foo -0.404949 -2.070146
3 three a bar 0.021224 1.088869
4 one b bar -0.627138 0.230198
5 one c bar 0.406708 0.871436
6 two a foo -2.645302 -0.561619
7 three b foo 0.484094 3.583494
8 one c foo 0.366780 -0.283473
9 one a bar -0.849498 -0.102099
10 two b bar 0.284562 0.401711
11 three c bar 1.021932 1.247990
In [45]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[45]:
C bar foo
A B
one a -0.849498 0.400035
b -0.627138 -0.658406
c 0.406708 0.366780
three a 0.021224 NaN
b NaN 0.484094
c 1.021932 NaN
two a NaN -2.645302
b 0.284562 NaN
c NaN -0.404949

plot

In [46]:
# Aggregate per year, then let pandas draw one subplot per column on a
# shared y scale.
yearly_totals = data.groupby('year').sum()
p = yearly_totals.plot(subplots=True, figsize=(20, 20), rot=0, sharey=True, legend=True)
# Only the first returned axes gets the whole-number tick formatter here.
offset_off(p[0])
In [47]:
data.plot.bar(x='year')
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f733da592e8>
In [48]:
import seaborn as sns
In [49]:
sns.barplot(x='year', y='hare', data=data, palette="BuGn_d")
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f732b8e40b8>

plotly

In [200]:
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()
import plotly.graph_objs as go
In [201]:
hares = [
    go.Bar(
        x=data['year'],
        y=data['hare']
    )
]

iplot(hares, filename='basic-bar')

bokeh

In [52]:
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.charts import Bar
output_notebook()
BokehJS successfully loaded.
In [53]:
# Bar chart of the hare population per year, built with the high-level
# `Bar` helper imported from bokeh.charts, and shown inline.
# NOTE(review): bokeh.charts appears to have been dropped from later Bokeh
# releases — confirm the installed version still provides it.
p = Bar(data, 'year', values='hare', title="Hare")
show(p)
/usr/local/lib/python3.4/dist-packages/bokeh/charts/_attributes.py:78: FutureWarning:

sort(columns=....) is deprecated, use sort_values(by=.....)

In [54]:
import pygal
In [55]:
# from IPython.display import SVG, HTML

import pygal
from IPython.display import display

# Build a pygal bar chart with a single 'Hare' series.
bar_chart = pygal.Bar()
bar_chart.add('Hare', data['hare'].values)
# The x-axis labels are the years converted to strings.
bar_chart.x_labels = map(str, data['year'].values)
# raw=True tells IPython the dict is already a MIME bundle, so the rendered
# chart is displayed as SVG instead of as the repr of a dict.
display({'image/svg+xml': bar_chart.render()}, raw=True)
Pygal0010000100002000020000300003000040000400005000050000600006000070000700001900190119021903190419051906190719081909191019111912191319141915191619171918191919203000027.987545787545784428.98926654719004720058.4087912087912371.29695885519017020088.83003663003662294.150268336190277400119.25128205128202270.0190336300149.67252747252746407.857781753190420600180.0937728937729460.518783542190518100210.51501831501832468.904293381190621400240.93626373626375457.835420394190722000271.3575091575091455.822898032190825400301.77875457875456444.418604651190927100332.2438.716457961191040300362.6212454212454394.440966011191157000393.04249084249085338.425760286191276600423.4637362637363272.683363148191352300453.88498168498165354.190518784191419500484.3062271062271464.208407871191511200514.7274725274724492.04830053719167600545.148717948718504.123434705191714600575.5699633699634480.644007156191816200605.9912087912088475.277280859191924700636.4124542124541446.7665474061920Hare

Виды графиков

Scatterplot

In [56]:
fig, ax1= plt.subplots(1, 1)
ax1.plot(data['year'], data['hare'], 'o');
In [57]:
data.plot.scatter(x='year', y='hare')
Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f732a8b9978>

Bar Charts (столбчатые диаграммы)

In [58]:
# Simple bar chart of the hare population per year.
fig, ax1 = plt.subplots(1, 1)
width = 0.2
ax1.bar(data['year'], data['hare'], width=width);  # the width argument sets the bar thickness
In [59]:
data.plot.bar(x='year', y='hare')
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f732989b208>
In [60]:
# Stacked bars: lynxes drawn on top of hares for every year.
fig, ax1 = plt.subplots(1, 1)
width = 0.8
years, hares, lynxes = data['year'], data['hare'], data['lynx']
ax1.bar(years, hares, width=width, color='#98cff4')
# With bottom= given, each bar starts at that height instead of at zero.
ax1.bar(years, lynxes, width=width, bottom=hares, color='#ffe4e1');
In [61]:
data.plot.bar(x='year', y=['hare', 'lynx', 'carrot'], stacked=True)
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f73297252e8>

Area plots (диаграммы областей)

In [62]:
fig, ax1= plt.subplots(1, 1)

ax1.fill_between(data['year'], 0, data['hare'])
ax1.set_ylabel('Area between \n y=0 and hares')
Out[62]:
<matplotlib.text.Text at 0x7f733db5f978>

Stacked Area plots (составные диаграммы областей)

In [63]:
fig, ax1= plt.subplots(1, 1)

ax1.stackplot(data['year'], data['hare'], data['lynx'], data['carrot'])
ax1.legend(['hares','lynxes','carrots'], frameon=False,loc='upper center');
In [64]:
data.plot.area(x='year')
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f733dfdcda0>
In [65]:
data.plot.area(x='year', stacked=False)
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f733dd057b8>

Grouped bar charts (сгруппированные столбчатые диаграммы)

In [66]:
# Grouped bar chart for the first five years: hares and lynxes side by side.
hares5 = data['hare'][0:5]
lynxes5 = data['lynx'][0:5]
new_t5 = data['year'][0:5]

fig, ax1 = plt.subplots(1, 1, figsize=(12,5))
bar_width = 0.3

# align='edge' pins the left edge of every bar to its x value regardless of
# the Matplotlib default (which changed from 'edge' to 'center' in 2.0), so
# the tick at year + bar_width always sits between the two bars of a group.
hares_bar = ax1.bar(new_t5, hares5, bar_width,
                    align='edge',
                    color='b',
                    label='Hares')

lynxes_bar = ax1.bar(new_t5 + bar_width, lynxes5, bar_width,
                     align='edge',
                     color='r',
                     label='Lynxes')

ax1.set_xlabel('Year')
ax1.set_ylabel('Population')
ax1.set_title('Population by Species')
# Tick labels come from the data rather than a hard-coded tuple of years.
ax1.set_xticks(new_t5 + bar_width)
ax1.set_xticklabels(new_t5.astype(int).astype(str))
ax1.legend();
In [67]:
data[data['year'] < 1905][['hare', 'lynx', 'year']].plot.bar(x='year', y=['hare', 'lynx'])
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f733dfa8518>

Круговые диаграммы

In [68]:
# Polar bar chart: one bar per year, bar length = hare population.
fig = plt.figure()
ax = plt.axes([0.025,0.025,0.95,0.95], polar=True)  # another way to create axes: explicit rectangle, polar projection

N = len(data['year'])
# Map the years to angles evenly spaced around the circle.
# linspace(endpoint=False) yields exactly N angles; arange with a float step
# can produce one element more or less because of rounding.
theta = np.linspace(0.0, 2*np.pi, N, endpoint=False)
radii = data['hare']
mywidth = 0.3
bars = ax.bar(theta, radii, width=mywidth, bottom=0.0)

rmax = np.max(radii)

for r, bar in zip(radii, bars):
    bar.set_facecolor(plt.cm.Pastel1_r(r/rmax))  # the colormap takes a number between 0 and 1
    bar.set_alpha(0.5)  # transparency of the bar; try setting it to 1

# One tick per bar. N tick positions must match the N year labels: current
# Matplotlib raises when the two counts differ.
ax.set_xticks(theta)
ax.set_xticklabels(data['year'].astype(int))
ax.set_yticklabels([])
plt.show()

Distribution Plots (диаграммы распределения)

In [69]:
mu = 100.0
sigma1 = 15.0
A1 = np.random.normal(mu, sigma1, 10000) # Let's generate fake data, like IQ measurements
In [70]:
fig, ax = plt.subplots(1, 1)

# Histogram of the data, normalised so that the bar areas sum to 1.
# density=True replaces the normed=1 keyword, which Matplotlib 3.1 removed.
n, bins, patches = plt.hist(A1, 50, density=True, facecolor='#368d5c', alpha=0.75)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)

# Drop the top/right frame lines and keep ticks on the remaining two sides.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

plt.show()
In [71]:
fig, ax = plt.subplots(1, 1)

# Histogram of the data, normalised so that the bar areas sum to 1.
# density=True replaces the normed=1 keyword, which Matplotlib 3.1 removed.
n, bins, patches = plt.hist(A1, 50, density=True, facecolor='#368d5c', alpha=0.75)

# Add a 'best fit' line: the normal density N(mu, sigma1) evaluated at the
# bin edges. It is written out with NumPy because matplotlib.mlab.normpdf
# was removed in Matplotlib 3.1.
y = np.exp(-0.5 * ((bins - mu) / sigma1) ** 2) / (sigma1 * np.sqrt(2 * np.pi))
l = plt.plot(bins, y, 'r--', linewidth=2)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)

# Drop the top/right frame lines and keep ticks on the remaining two sides.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

plt.show()
In [72]:
import seaborn as sns
# distplot returns the Axes it drew on; keep it so the styling below is
# applied to this figure and not to the `ax` left over from the previous cell.
# NOTE(review): distplot is deprecated in recent seaborn releases in favour
# of histplot/displot — confirm against the installed version.
p = sns.distplot(A1)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)

# Drop the top/right frame lines and keep ticks on the remaining two sides.
p.spines["top"].set_visible(False)
p.spines["right"].set_visible(False)
p.yaxis.set_ticks_position('left')
p.xaxis.set_ticks_position('bottom')

Density plot (диаграмма плотности распределения)

In [73]:
df = pd.DataFrame(A1)
ax = df.plot(kind='density', color = 'black')
ax.spines["top"].set_visible(False)    
ax.spines["right"].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.legend().set_visible(False)

Comparing distributions (сравнение распределений)

In [74]:
mu = 100.0
sigma1 = 15.0
sigma2 = 25.0
A1 = np.random.normal(mu, sigma1, 10000) # IQ measurements of humans
A2 = np.random.normal(mu, sigma2, 10000) # IQ measurements of aliens
In [75]:
fig, ax = plt.subplots(1, 1)

# Overlay the two normalised histograms. density=True replaces the normed=1
# keyword, which Matplotlib 3.1 removed. The labels feed the legend so the
# reader can tell which sample is which.
n, bins, patches = plt.hist(A1, 50, density=True, facecolor='#368d5c', alpha=0.75,
                            label=r'A1: $\sigma=%g$' % sigma1)
n, bins, patches = plt.hist(A2, 50, density=True, facecolor='#efbb38', alpha=0.75,
                            label=r'A2: $\sigma=%g$' % sigma2)

plt.xlabel('Smarts')
plt.ylabel('Probability')
# The title names both standard deviations, since two samples are shown.
plt.title(r'$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma_1=15,\ \sigma_2=25$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)
plt.legend(frameon=False)

# Drop the top/right frame lines and keep ticks on the remaining two sides.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

plt.show()

Boxplots (ящик с усами)

In [76]:
fig, ax= plt.subplots(1, 1)
ax.boxplot([A1, A2]);
plt.setp(ax, xticklabels=['A1', 'A2']);
In [77]:
# Box plots of both samples with black boxes/whiskers and red outlier markers.
fig, ax = plt.subplots(1, 1)
bp = ax.boxplot([A1, A2]);
plt.setp(ax, xticklabels=['A1', 'A2'])
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
# Property names must be lowercase: current Matplotlib no longer accepts
# mixed-case spellings such as MarkerFaceColor in setp().
plt.setp(bp['fliers'], marker='o', markerfacecolor='red');
In [78]:
mu = 100.0
sigma1 = 15.0
sigma2 = 25.0
A1 = np.random.normal(mu, sigma1, 10000) # IQ measurements of humans
A2 = np.concatenate((np.random.normal(mu-50, sigma1, 10000), np.random.normal(mu+50, sigma2, 10000)), axis=0) # IQ measurements of aliens
In [79]:
fig, ax= plt.subplots(1, 1)
ax.boxplot([A1, A2])
plt.setp(ax, xticklabels=['A1', 'A2']);
In [80]:
fig, ax= plt.subplots(1, 1)
ax.violinplot([A1, A2], showmeans=False, showmedians=True);
In [81]:
df=pd.read_csv('crimeRatesByState2005.tsv',header=0,sep='\t')
In [82]:
df.head()
Out[82]:
state murder Forcible_rate Robbery aggravated_assult burglary larceny_theft motor_vehicle_theft population
0 Alabama 8.2 34.3 141.4 247.8 953.8 2650.0 288.3 4627851
1 Alaska 4.8 81.1 80.9 465.1 622.5 2599.1 391.0 686293
2 Arizona 7.5 33.8 144.4 327.4 948.4 2965.2 924.4 6500180
3 Arkansas 6.7 42.9 91.1 386.8 1084.6 2711.2 262.1 2855390
4 California 6.9 26.0 176.1 317.3 693.3 1916.5 712.8 36756666
In [83]:
# Every column except the first ('state') as a plain NumPy array, plus the
# state names as a separate array. `.values` is used because
# DataFrame.as_matrix was removed in pandas 1.0.
state_data = df[df.columns[1:]].values
state_names = np.array(df['state'])
In [84]:
# Bubble chart: murder vs burglary, bubble area from population, colour from
# motor vehicle theft. The colormap is passed by name because
# matplotlib.cm.get_cmap is no longer available in current Matplotlib.
fig, ax = plt.subplots(1, 1)
sc = plt.scatter(df['murder'], df['burglary'], s=df['population'] / 100000, c=df['motor_vehicle_theft'], alpha=0.5, cmap='viridis')
plt.colorbar(sc)

# This figure is not final: you should put labels, title, units, remove the top and right axes if needed, add a grid, and so on.
# But you know how to do that!
Out[84]:
<matplotlib.colorbar.Colorbar at 0x7f733de4d828>
In [85]:
# Same bubble chart, now with every bubble labelled by its state name.
# The colormap is passed by name because matplotlib.cm.get_cmap is no longer
# available in current Matplotlib.
fig, ax = plt.subplots(1, 1, figsize=(10,5))
plt.scatter(df['murder'], df['burglary'], s=df['population']/100000, c=df['motor_vehicle_theft'], alpha=0.5, cmap='viridis')

# Columns 0 and 4 of state_data are 'murder' and 'burglary' (state_data holds
# every column after 'state'); the small offsets nudge the text off the marker.
for i in range(len(state_names)):
    ax.annotate(state_names[i], (state_data[i,0] + 0.2, state_data[i,4] + 10))
plt.colorbar();

# The result is a bit cluttered, but it is hard to untangle the text in an
# automatic way. We can solve this by using interactive visualization.
In [86]:
# Bubble chart of murder vs burglary rates: bubble area follows population,
# colour follows the robbery rate, and each bubble carries its state name.
fig, ax = plt.subplots(1,1, figsize = (10,5))

state_names=np.array(df['state'])
x = np.array(df['murder'])
y = np.array(df['burglary'])
area = np.array(df['population']) / 30000
colours = np.array(df['Robbery'])
text = np.array(df['state'])

ax.scatter(x, y, s = area, c = colours, cmap = 'inferno', alpha = 0.5, linewidth = 0)

ax.set_xlabel('Murder', fontsize = 10)
ax.set_ylabel('Burglary', fontsize = 10)  # fixed spelling of the axis label

for i, state in enumerate(text):
    ax.annotate(state, (x[i],y[i]), fontsize = 7)
In [ ]:
 
In [87]:
# Interactive version of the bubble chart: hovering a marker shows the state.
trace0 = go.Scatter(
    x=x,
    y=y,
    mode='markers',
    marker=dict(
        size=area,
        sizemode='area',
        sizemin=4,
        color=colours
    ),
    text=text
)

# A dedicated name keeps `data` bound to the populations DataFrame, so the
# earlier cells can still be re-run after this one.
bubble_traces = [trace0]
iplot(bubble_traces, filename='bubblechart-size-ref')
In [88]:
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
output_notebook()

p = figure()
p.scatter(x, y, radius=area / 1000., fill_color='black',  fill_alpha=0.8, line_color=None)

show(p)
BokehJS successfully loaded.

Качество визуализации для презентации

In [89]:
import seaborn as sns
sns.pairplot(df, kind="reg")
Out[89]:
<seaborn.axisgrid.PairGrid at 0x7f733740ffd0>
In [90]:
# Pairwise correlations of the numeric columns only: the string 'state'
# column cannot be correlated and makes corr() fail on current pandas.
df.select_dtypes(include=[np.number]).corr()
Out[90]:
murder Forcible_rate Robbery aggravated_assult burglary larceny_theft motor_vehicle_theft population
murder 1.000000 0.122539 0.755475 0.666560 0.623176 0.268918 0.429665 0.334635
Forcible_rate 0.122539 1.000000 -0.034069 0.445004 0.355869 0.325941 0.203256 -0.165830
Robbery 0.755475 -0.034069 1.000000 0.565577 0.430824 0.161713 0.510915 0.591129
aggravated_assult 0.666560 0.445004 0.565577 1.000000 0.622240 0.437775 0.370480 0.211076
burglary 0.623176 0.355869 0.430824 0.622240 1.000000 0.686875 0.521853 0.150310
larceny_theft 0.268918 0.325941 0.161713 0.437775 0.686875 1.000000 0.505688 -0.003594
motor_vehicle_theft 0.429665 0.203256 0.510915 0.370480 0.521853 0.505688 1.000000 0.236577
population 0.334635 -0.165830 0.591129 0.211076 0.150310 -0.003594 0.236577 1.000000

Heatmap (тепловая карта)

In [91]:
nba = pd.read_csv('nba.csv', index_col=0)
# Normalise every column: centre it on its mean and divide by its range.
column_means = nba.mean()
column_ranges = nba.max() - nba.min()
nba_norm = (nba - column_means) / column_ranges
In [92]:
nba_norm.head()
Out[92]:
G MIN PTS FGM FGA FGP FTM FTA FTP 3PM 3PA 3PP ORB DRB TRB AST STL BLK TO PF
Name
Dwyane Wade 0.143158 0.233535 0.718308 0.595714 0.561296 0.106030 0.431875 0.498987 -0.143017 -0.032143 0.039429 -0.023 -0.066154 -0.079740 -0.075789 0.379375 0.447500 0.250714 0.408333 -0.068333
LeBron James 0.178246 0.142626 0.579846 0.399286 0.366852 0.095980 0.400625 0.448354 -0.101117 0.146429 0.210857 0.004 -0.014872 0.231948 0.152281 0.348125 0.239167 0.179286 0.241667 -0.318333
Kobe Bryant 0.195789 -0.008889 0.456769 0.417143 0.459444 -0.014573 0.181875 0.131899 0.111173 0.075000 0.125143 0.011 -0.066154 -0.053766 -0.058246 0.108542 0.155833 -0.035000 0.075000 -0.068333
Dirk Nowitzki 0.178246 0.142626 0.387538 0.381429 0.376111 0.045729 0.197500 0.106582 0.206145 -0.139286 -0.160571 0.019 -0.066154 0.361818 0.222456 -0.151875 -0.135833 0.072143 -0.216667 -0.110000
Danny Granger -0.067368 -0.008889 0.379846 0.185000 0.292778 -0.115075 0.197500 0.131899 0.172626 0.539286 0.496571 0.064 -0.168718 -0.014805 -0.067018 -0.120625 -0.052500 0.286429 0.033333 0.265000
In [93]:
# Heatmap of the normalised NBA statistics: one row per player, one column
# per statistic. The colormap is passed by name because
# matplotlib.cm.get_cmap is no longer available in current Matplotlib.
fig, ax = plt.subplots(1, 1, figsize=(15, 12))
ax.pcolor(nba_norm, cmap='Blues', alpha=0.8)

# put the major ticks at the middle of each cell
ax.set_yticks(np.arange(nba_norm.shape[0]) + 0.5, minor=False)
ax.set_xticks(np.arange(nba_norm.shape[1]) + 0.5, minor=False)

# want a more natural, table-like display: first row on top, column names above
ax.invert_yaxis()
ax.xaxis.tick_top()

# Set the labels
# label source: https://en.wikipedia.org/wiki/Basketball_statistics
labels = ['Games','Minutes','Points','Field goals made','Field goal attempts','Field goal percentage','Free throws made','Free throws attempts','Free throws percentage','Three-pointers made','Three-point attempt','Three-point percentage','Offensive rebounds','Defensive rebounds','Total rebounds','Assists','Steals','Blocks','Turnover','Personal foul'];

# nba_norm.columns would also work, but the spelled-out names are easier to read
ax.set_xticklabels(labels, minor=False)
ax.set_yticklabels(nba_norm.index, minor=False)

# rotate the x tick labels so the long names do not overlap
t = plt.xticks(rotation=90)
In [94]:
# Interactive version of the heatmap.
trace = go.Heatmap(
    z=nba_norm.values,  # .values replaces DataFrame.as_matrix(), removed in pandas 1.0
    x=labels,
    y=nba_norm.index,
    colorscale='Blues',
    opacity=0.8,
)
# A dedicated name keeps `data` free for the populations DataFrame.
heatmap_traces = [trace]
iplot(heatmap_traces, filename='basic-heatmap')
In [ ]:
 

Работа с геоданными

In [95]:
# Import the basemap package
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
# Download the OpenFlights airport table; every field is kept as a string.
airports = pd.read_csv('https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat', header=None, dtype=str)

airports.columns = ["id", "name", "city", "country", "code", "icao", "latitude", "longitude", "altitude", "offset", "dst", "timezone", 'dat1', 'dat2']

# World map in a Mercator projection, with coastlines and the map frame.
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
m.drawcoastlines()
m.drawmapboundary()

# Project the airport coordinates (degrees) to map x/y coordinates.
longitudes = list(airports["longitude"].astype(float))
latitudes = list(airports["latitude"].astype(float))
x, y = m(longitudes, latitudes)

# One small red dot per airport.
m.scatter(x, y, 1, marker='o', color='red')
plt.show()
In [96]:
# Download the OpenFlights route table; every field is kept as a string.
routes = pd.read_csv('https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat', header=None, dtype=str)
routes.columns = ["airline", "airline_id", "source", "source_id", "dest", "dest_id", "codeshare", "stops", "equipment"]

# Mercator base map with coastlines.
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
m.drawcoastlines()

# Only the first 3000 routes are drawn.
for _, route in routes[:3000].iterrows():
    try:
        # Look up both endpoints; .iloc[0] raises IndexError when an id has no match.
        source = airports[airports["id"] == route["source_id"]].iloc[0]
        dest = airports[airports["id"] == route["dest_id"]].iloc[0]
        source_lon = float(source["longitude"])
        dest_lon = float(dest["longitude"])
        # Skip overly long routes (90 degrees of longitude or more apart).
        if abs(source_lon - dest_lon) < 90:
            # Great circle between the source and destination airports.
            m.drawgreatcircle(source_lon, float(source["latitude"]), dest_lon, float(dest["latitude"]),linewidth=1,color='b')
    except (ValueError, IndexError):
        # Best effort: a route with an unknown airport or a bad coordinate is not drawn.
        continue

# Show the map.
plt.show()
In [97]:
# Use the public loader: the `sklearn.datasets.california_housing` module path
# is private and is not importable in newer scikit-learn releases.
from sklearn.datasets import fetch_california_housing

dataset = fetch_california_housing()

X = dataset.data
Y = dataset.target

plt.figure(figsize=(10, 10))

# Bounding box of the samples. This cell treats the last feature column as
# longitude and the second-to-last as latitude.
lllon, lllat, urlon, urlat = X[:, -1].min(), X[:, -2].min(), X[:, -1].max(), X[:, -2].max()

m = Basemap(
    llcrnrlon=lllon,
    llcrnrlat=lllat,
    urcrnrlon=urlon,
    urcrnrlat=urlat,
    projection='merc',
    resolution='h'
)

m.drawcoastlines(linewidth=0.5)
m.drawmapboundary(fill_color='#47A4C9', zorder=1)
m.fillcontinents(color='#88D8B0', lake_color='#47A4C9', zorder=2)

# draw parallels (labelled on the left edge)
parallels = np.linspace(lllat, urlat, 10)
m.drawparallels(parallels, labels=[1, 0, 0, 0], fontsize=10)
# draw meridians (labelled on the bottom edge)
meridians = np.linspace(lllon, urlon, 10)
m.drawmeridians(meridians, labels=[0, 0, 0, 1], fontsize=10)

# Min-max normalise the target once and map it through the "hot" colormap in
# a single vectorised call (the previous per-sample comprehension recomputed
# Y.min() / Y.max() for every point).
y_min, y_max = Y.min(), Y.max()
colors = plt.cm.hot((Y - y_min) / (y_max - y_min))
m.scatter(X[:, -1], X[:, -2], latlon=True, zorder=3, lw=0, c=colors)

# City labels are placed by hand in axes-fraction coordinates.
plt.annotate('San Francisco', xy=(0.04, 0.5), xycoords='axes fraction', color='white', size=15)
plt.annotate('Los Angeles', xy=(0.4, 0.08), xycoords='axes fraction', color='white', size=15)

plt.show()
In [98]:
import folium

# Minimal interactive folium map, centred on the given [latitude, longitude].
# Leaving the map object as the last expression renders it in the notebook.
m = folium.Map(
    location=[45.5236, -122.6750],
)
m
Out[98]:
In [99]:
import folium

# Get a basic world map.
airports_map = folium.Map(location=[30, 0], zoom_start=2)

# Draw a marker, with the airport name as popup, for the leading rows only.
# The slice keeps 102 rows: the same count the enumerate/break form let
# through (its `idx > 100` check ran after the marker had been added).
for _, row in airports.iloc[:102].iterrows():
    marker_position = [float(row["latitude"]), float(row["longitude"])]
    folium.Marker(marker_position, popup=row["name"]).add_to(airports_map)

# Create and show the map.
airports_map
Out[99]:
In [100]:
import geoplotlib
from geoplotlib.utils import read_csv


# The flights table gets its own name: `data` already holds the populations
# DataFrame that the plotting cells at the top of the notebook read, and
# rebinding it here would break those cells when they are re-run.
flights = read_csv('flights.csv')

# Draw one edge per flight, from the departure to the arrival coordinates.
geoplotlib.graph(flights,
                 src_lat='lat_departure',
                 src_lon='lon_departure',
                 dest_lat='lat_arrival',
                 dest_lon='lon_arrival',
                 color='hot_r',
                 alpha=16,
                 linewidth=2)
geoplotlib.show()
In [135]:
import plotly.plotly as py
from plotly.graph_objs import Figure, Layout, Scatter

import networkx as nx

# Random geometric graph with 200 nodes; each node carries a 'pos' attribute
# holding its (x, y) coordinates.
G = nx.random_geometric_graph(200, 0.125)
pos = nx.get_node_attributes(G, 'pos')

# Node closest to the centre of the unit square, and the hop distance from it
# to every reachable node. NOTE: `p` is not used by the figure below.
ncenter = min(pos, key=lambda n: (pos[n][0] - 0.5) ** 2 + (pos[n][1] - 0.5) ** 2)
p = nx.single_source_shortest_path_length(G, ncenter)

# Edge coordinates: every edge contributes its two end points followed by
# None, so all edges can be drawn as separate segments of one 'lines' trace.
# The lists are built first and handed to the trace constructor, rather than
# appended to trace properties in place.
edge_x, edge_y = [], []
for start_node, end_node in G.edges():
    x0, y0 = pos[start_node]
    x1, y1 = pos[end_node]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node coordinates plus the number of neighbours of each node, which drives
# both the marker colour and the hover text. len(G[n]) counts the neighbours
# of n; the previous len() over the items of G.adjacency() measured a
# (node, neighbours) pair and was therefore always 2.
node_ids = list(G.nodes())
node_x = [pos[n][0] for n in node_ids]
node_y = [pos[n][1] for n in node_ids]
node_degree = [len(G[n]) for n in node_ids]
node_text = ['# of connections: ' + str(degree) for degree in node_degree]

node_trace = Scatter(
    x=node_x,
    y=node_y,
    text=node_text,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        # 'Greys' | 'Greens' | 'Bluered' | 'Hot' | 'Picnic' | 'Portland' |
        # 'Jet' | 'RdBu' | 'Blackbody' | 'Earth' | 'Electric' | 'YlOrRd' | 'YlGnBu'
        colorscale='YlGnBu',
        reversescale=True,
        color=node_degree,
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=2)))

# Both axes are stripped of grid, zero line and tick labels: only the graph
# itself is meaningful.
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)

fig = Figure(data=[edge_trace, node_trace],
             layout=Layout(
                title='<br>Network graph made with Python',
                titlefont=dict(size=16),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="Python code: <a href='https://plot.ly/ipython-notebooks/network-graphs/'> https://plot.ly/ipython-notebooks/network-graphs/</a>",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(hidden_axis),
                yaxis=dict(hidden_axis)))

# NOTE(review): `iplot` is not imported in this cell; presumably it comes from
# an earlier cell (e.g. plotly.offline) - confirm, or call `py.iplot` instead.
iplot(fig, filename='networkx')
In [ ]: